pRactice corner: ANOVA to compare sample means

lruolin

Data

The data used below was sourced from [Easy Food Science for Statistics]https://www.sciencedirect.com/book/9780128142622/easy-statistics-for-food-science-with-r

Objective

To carry out ANOVA to compare the GI for different types of yellow noodles prepared using partial substitution of wheat flour with banana pulp or peel flour.
GI is glycemic index, which is a measure of the blood glucose raising ability of the available carbohydrate in foods, and is defined as the incremental area under the glycemic response curve elicited by a portion of food containing 50g of availalbe carbohydrate expressed as a percentage of the area under curve elicited by a 50g glucose in the same subject (Wolever et al., “Measuring the Glycemic Index of Foods.”). In my other life, I was involved in clinical studies looking at glycemic index of foods in Singapore. I vividly remember preparing 50 g of sugar solutions, doing lab tests to calculate what is the equivalent to 50g of total available carbohydrates, as well as running IAUC calculations on Excel before I got to know about SPSS. Nevertheless, it was an enriching journey as a research officer!

Load Packages

library(pacman)
p_load(tidyverse, janitor, datapasta, ggthemes, broom, gridExtra, ggstatsplot,
       GGally)

Load Data

data <- tibble::tribble(
                ~Noodles, ~GI_1, ~GI_2, ~GI_3,
          "conventional", 52.83, 53.02, 52.85,
   "ripe cavendish pulp",  48.8, 49.05, 48.65,
   "ripe cavendish peel", 50.66, 50.64, 50.95,
  "green cavendish pulp", 48.41, 47.87, 48.63,
  "green cavendish peel", 51.49, 51.41, 48.09,
       "ripe dream pulp", 47.99, 47.73, 49.36,
       "ripe dream peel",  49.6, 49.36, 49.09,
      "green dream pulp", 47.26, 47.88,  47.7,
      "green dream peel", 52.52, 52.55, 52.54
  ) %>% 
    mutate(noodles_abbrev =factor(Noodles,
                                     levels = c("conventional",
                                     "ripe cavendish pulp",
                                     "ripe cavendish peel",
                                     "green cavendish pulp",
                                     "green cavendish peel",
                                     "ripe dream pulp",
                                     "ripe dream peel",
                                     "green dream pulp",
                                     "green dream peel"),
         labels = c("control", "rcpulp", "rcpeel", "gcpulp", "gcpeel",
                     "rdpulp", "rdpeel", "gdpulp",
                    "gdpeel"))) 

# data without control
data_no_control <- data %>% 
  filter(Noodles != "conventional") %>% 
    # create additional columns in case I want to look at effect of species/ripeness/part later
  mutate(noodles2 = Noodles) %>% 
  separate(noodles2, c("ripe_unripe", "species", "part"))

glimpse(data_no_control)

Rows: 8
Columns: 8
$ Noodles        <chr> "ripe cavendish pulp", "ripe cavendish peel",…
$ GI_1           <dbl> 48.80, 50.66, 48.41, 51.49, 47.99, 49.60, 47.…
$ GI_2           <dbl> 49.05, 50.64, 47.87, 51.41, 47.73, 49.36, 47.…
$ GI_3           <dbl> 48.65, 50.95, 48.63, 48.09, 49.36, 49.09, 47.…
$ noodles_abbrev <fct> rcpulp, rcpeel, gcpulp, gcpeel, rdpulp, rdpee…
$ ripe_unripe    <chr> "ripe", "ripe", "green", "green", "ripe", "ri…
$ species        <chr> "cavendish", "cavendish", "cavendish", "caven…
$ part           <chr> "pulp", "peel", "pulp", "peel", "pulp", "peel…

Visualization

data_long <- data %>% 
  pivot_longer(cols = starts_with("GI"),
               names_to = "reading",
               values_to = "gi")


glimpse(data_long)

Rows: 27
Columns: 4
$ Noodles        <chr> "conventional", "conventional", "conventional…
$ noodles_abbrev <fct> control, control, control, rcpulp, rcpulp, rc…
$ reading        <chr> "GI_1", "GI_2", "GI_3", "GI_1", "GI_2", "GI_3…
$ gi             <dbl> 52.83, 53.02, 52.85, 48.80, 49.05, 48.65, 50.…

summary(data_long)

   Noodles          noodles_abbrev   reading                gi       
 Length:27          control:3      Length:27          Min.   :47.26  
 Class :character   rcpulp :3      Class :character   1st Qu.:48.25  
 Mode  :character   rcpeel :3      Mode  :character   Median :49.36  
                    gcpulp :3                         Mean   :49.89  
                    gcpeel :3                         3rd Qu.:51.45  
                    rdpulp :3                         Max.   :53.02  
                    (Other):9

fig_anova <- data_long%>% 
  group_by(Noodles) %>% 
  summarise(mean_gi = mean(gi)) %>% 
  ggplot(aes(x = fct_reorder(Noodles, mean_gi), y = mean_gi)) +
  geom_boxplot() +
  geom_text(aes(label = round(mean_gi, 2)), hjust = -0.25) +
  scale_y_continuous(limits = c(47, 55)) +
  labs(title = "Comparing GI of different noodles",
       subtitle = "Conventional noodles had the highest GI, and flours made with peel have higher GI than that from pulp.",
       x = "",
       y = "Mean GI (Average of Triplicates)",
       caption = "Source: Easy Statistics for Food Science with R") +
  coord_flip() +
  theme_clean()

# 1 way anova:

m1_aov <- aov(gi ~ noodles_abbrev, data = data_long)
summary(m1_aov)

               Df Sum Sq Mean Sq F value   Pr(>F)    
noodles_abbrev  8  85.34  10.668   19.46 2.26e-07 ***
Residuals      18   9.87   0.548                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Post-hoc comparison

fig_post_hoc <- TukeyHSD(m1_aov, which = "noodles_abbrev", ordered = F) %>% 
  tidy() %>% 
  filter(adj.p.value<0.05) %>% 
  mutate(high_low = case_when(estimate >=0 ~ "higher",
                             TRUE ~ "lower")) %>% 
  ggplot(aes(x = fct_reorder(contrast, estimate), y = estimate)) +
  geom_point(size = 3, aes(col = high_low)) +
  geom_errorbar(aes(ymin = conf.low,
                    ymax = conf.high, col = high_low), width = 0.5) +
  coord_flip() +
  labs(title = "95% family-wise confidence level for groups with p<0.05",
       x = "",
       y = "Differences in mean levels of noodles") +
  theme_clean() +
  theme(legend.position = "none")

grid.arrange(fig_anova, fig_post_hoc)

The results below are slightly different due to different post-hoc analysis.

# another way, using ggstatsplot

ggbetweenstats(
  data = data_long,
  x = noodles_abbrev,
  y = gi,
  plot.type = "box",
  type = "p",
  package = "ggsci",
  palette = "nrc_npg",
  pairwise.display = "significant",
  title = "Comparing GI for different noodles"
)

Confounding?

Let’s look at the effect of ripeness, variety and part on GI.

glimpse(data_no_control)

Rows: 8
Columns: 8
$ Noodles        <chr> "ripe cavendish pulp", "ripe cavendish peel",…
$ GI_1           <dbl> 48.80, 50.66, 48.41, 51.49, 47.99, 49.60, 47.…
$ GI_2           <dbl> 49.05, 50.64, 47.87, 51.41, 47.73, 49.36, 47.…
$ GI_3           <dbl> 48.65, 50.95, 48.63, 48.09, 49.36, 49.09, 47.…
$ noodles_abbrev <fct> rcpulp, rcpeel, gcpulp, gcpeel, rdpulp, rdpee…
$ ripe_unripe    <chr> "ripe", "ripe", "green", "green", "ripe", "ri…
$ species        <chr> "cavendish", "cavendish", "cavendish", "caven…
$ part           <chr> "pulp", "peel", "pulp", "peel", "pulp", "peel…

data_long_no_control <- data_no_control %>% 
  pivot_longer(cols = starts_with("GI"),
               names_to = "replicates",
               values_to = "gi")

# main effect
m2_aov <- aov(gi ~ ripe_unripe + species + part, data = data_long_no_control)
summary(m2_aov) # residuals reduced compared to m2_aov

            Df Sum Sq Mean Sq F value   Pr(>F)    
ripe_unripe  1   0.83    0.83   0.612    0.443    
species      1   0.05    0.05   0.035    0.853    
part         1  36.43   36.43  26.767 4.62e-05 ***
Residuals   20  27.22    1.36                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# interaction

m3_aov <- aov(gi ~ ripe_unripe * species * part, data = data_long_no_control)
summary(m3_aov)

                         Df Sum Sq Mean Sq F value   Pr(>F)    
ripe_unripe               1   0.83    0.83   1.353  0.26181    
species                   1   0.05    0.05   0.078  0.78424    
part                      1  36.43   36.43  59.208 9.15e-07 ***
ripe_unripe:species       1   4.31    4.31   7.004  0.01760 *  
ripe_unripe:part          1   6.13    6.13   9.963  0.00611 ** 
species:part              1   1.46    1.46   2.365  0.14361    
ripe_unripe:species:part  1   5.48    5.48   8.909  0.00875 ** 
Residuals                16   9.85    0.62                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# testing the models

anova(m2_aov, m3_aov) # p -s significant so interaction term is not by chance.

Analysis of Variance Table

Model 1: gi ~ ripe_unripe + species + part
Model 2: gi ~ ripe_unripe * species * part
  Res.Df     RSS Df Sum of Sq      F   Pr(>F)   
1     20 27.2226                                
2     16  9.8453  4    17.377 7.0602 0.001787 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Checking assumptions --

# to check for homogeneity of variance
library(car)
leveneTest(gi ~ ripe_unripe*species*part, data = data_long_no_control)  # p> 0.05, so equal variance can be assumed

Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group  7  0.7401 0.6424
      16

# to check whether residuals are normally distributed
m3_aov$residuals %>% 
  as_tibble() %>% 
  ggplot(aes(value)) +
  geom_histogram() +
  theme_clean() # slightly right-skewed

From the ANOVA results, the part of banana used (pulp or peel) has an effect on GI.

There are also interactions between ripeness:species, ripeness: part.

Visualizing interaction

fig_a <- data_long_no_control %>% 
  ggplot(aes(x = ripe_unripe, y = gi)) +
  geom_boxplot(aes(fill = species)) +
  labs(title = "Ripeness:Species Interaction",
       x = "Green/Ripe",
       y = "GI noodles",
       fill = "Species") +
  theme_clean()

fig_b <- data_long_no_control %>% 
  ggplot(aes(x = ripe_unripe, y = gi)) +
  geom_boxplot(aes(fill = part)) +
  labs(title = "Ripeness:Part Interaction",
       x = "Green/Ripe",
       y = "GI",
       fill = "Part") +
  theme_clean()

fig_c <- data_long_no_control %>% 
  ggplot(aes(x = part, y = gi)) +
  geom_boxplot(aes(fill = species)) +
  labs(title = "Part:Species Interaction",
       x = "Part",
       y = "GI") +
  theme_clean()

grid.arrange(fig_a, fig_b, fig_c, ncol = 2)

Learning points

It may be worthwhile to consider each type of noodles as a combination of three variables: ripeness, part and species to have more rigorous understanding of the ANOVA results.
When carrying out ANOVA, one needs to check whether the model meets normality assumptions
Visualizing post-hoc t-tests using the tidyverse way (and being able to filter out significant terms)
aov (ANOVA) tells you what are the factors that are important. The contribution of each term comes from lm (by looking at the coefficients).

Comment on this article Share:

ANOVA to compare sample means